Loading all required library

library(tidyverse)
## -- Attaching packages --------
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -----------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Load our dataset from pre-determind working directory

df <- read_excel("Data Analysis - Data Sheets.xlsx", sheet = "PT & FT Data PivotTable format")
View(df)
glimpse(df)
## Observations: 1,840
## Variables: 6
## $ Cluster   <chr> "Education", "Education", "Education", "Education", "Fami...
## $ Agency    <chr> "Education Agency 1", "Education Agency 2", "Education Ag...
## $ Year      <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 201...
## $ `PT/FT`   <chr> "Full-Time", "Full-Time", "Full-Time", "Full-Time", "Full...
## $ Gender    <chr> "Female", "Female", "Female", "Female", "Female", "Female...
## $ Headcount <dbl> 180, 2463, 32, 39251, 9817, 44, 82, 3205, 308, 76, 58, 83...

Pre-analysis Check

attach(df)
names(df)
## [1] "Cluster"   "Agency"    "Year"      "PT/FT"     "Gender"    "Headcount"
df$Type<- df$`PT/FT`
length(unique(Agency))
## [1] 92
any(is.character(df))
## [1] FALSE
table(df$Cluster)
## 
##                      Education    Family & Community Services 
##                             80                             60 
## Finance, Services & Innovation                         Health 
##                             40                            660 
##                       Industry                        Justice 
##                            160                            280 
##         Planning & Environment              Premier & Cabinet 
##                            160                            220 
##                      Transport                       Treasury 
##                            120                             60
table(df$Type)
## 
## Full-Time Part-Time 
##       920       920
df %>% 
  group_by(Cluster,Year) %>% 
  summarise(Avg.Headcount= mean(Headcount), Max.Headcount= max(Headcount), Min.Headcount= min(Headcount))

Anaysis Part

Trend<- df %>%
  select(Year,Gender, Headcount) %>%
  group_by(Year, Gender) %>%
  summarise(Total_headcount= sum(Headcount))

ggplotly(ggplot(Trend)+
  geom_bar(mapping = aes(Year, Total_headcount, fill=Gender), stat = "Identity",
           position = "dodge")+
  labs(title = "Total headcount by Year"))
ggplot(Trend)+
  geom_smooth(aes(Year,Total_headcount, color= Gender))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2014
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.0804
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2014
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 4.0804
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2014
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.0804
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2014
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 4.0804
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

df2<- df %>%
      filter( Type == "Part-Time")
Clus<- df2 %>%
          group_by(Cluster) %>%
          summarise(Total_headcount= sum(Headcount))
Clus_18<- df2 %>%
          filter(Year==2018) %>% 
          group_by(Cluster) %>%
          summarise(Total_headcount= sum(Headcount))


ggplot(Clus_18)+
  geom_bar(mapping = aes(Cluster, Total_headcount),stat="Identity",fill="blue")+
  coord_flip()+
    labs(title = "Total headcount among the clusters of 2018")

ggplot(Clus)+
  geom_bar(mapping = aes(Cluster, Total_headcount),stat="Identity",fill="blue")+
  coord_flip()+
    labs(title = "Total headcount among the clusters of 4 years")

df2 %>% 
  count(Cluster, name = "count") %>% 
  arrange(desc(count)) %>% 
  ggplot()+
  geom_bar(aes(Cluster,count), stat = "identity", fill="darkblue")+
  coord_flip()

df2 %>% 
  group_by(Cluster) %>% 
  summarise(Avg_Headcount = mean(Headcount)) %>% 
  ggplot()+
  geom_bar(aes(Cluster, Avg_Headcount), stat = "identity", fill="darkblue")+
  coord_flip()+
  labs(title = "Average headcount among the clusters of 4 years")

Prop<- df2 %>%
          group_by(Cluster)%>%
          mutate(Prop = Headcount/sum(Headcount)) %>%
          ungroup()

df2 %>%
   group_by(Cluster)%>%
   mutate(Prop = Headcount/sum(Headcount)) %>%
   ungroup() %>%
   ggplot(aes(Year, Prop, color=Gender))+
   geom_smooth(se=F)+
   facet_wrap(~Cluster)+
   scale_y_continuous(labels = scales::percent)+
   labs(title = "Changes of proportions over the years among cluster and gender of Part time workers")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

df2 %>%
   ggplot(aes(Year, Headcount, color=Gender))+
   geom_smooth(se=F)+
   facet_wrap(~Cluster)+
   labs(title = "Changes of headcounts over the years among clusters and gender of Part time workers")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Projecting Number of headcount in 2025

df-> df_mod
df_mod$Cluster<- factor(df_mod$Cluster)
df_mod$Gender<- factor(df_mod$Gender)
df_mod$Type<- factor(df_mod$Type)
df_mod<- df_mod %>% 
  select(-Agency, -'PT/FT')

set.seed(1234)

library(caTools)
sample<- sample.split(df_mod$Headcount, SplitRatio = 0.8)
test<- subset(df_mod, sample == TRUE)
train<- subset(df_mod, sample == FALSE)

model<- lm(Headcount~., data = test)
summary(model)
## 
## Call:
## lm(formula = Headcount ~ ., data = test)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7852   -975   -235    329  33639 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            -5122.359 100263.841  -0.051    0.959
## ClusterFamily & Community Services     -5386.357    512.562 -10.509  < 2e-16
## ClusterFinance, Services & Innovation  -5940.460    562.284 -10.565  < 2e-16
## ClusterHealth                          -5825.118    346.265 -16.823  < 2e-16
## ClusterIndustry                        -6378.582    403.585 -15.805  < 2e-16
## ClusterJustice                         -6100.311    372.517 -16.376  < 2e-16
## ClusterPlanning & Environment          -6657.241    407.763 -16.326  < 2e-16
## ClusterPremier & Cabinet               -6971.072    387.218 -18.003  < 2e-16
## ClusterTransport                       -5682.999    424.588 -13.385  < 2e-16
## ClusterTreasury                        -6863.252    512.603 -13.389  < 2e-16
## Year                                       6.444     49.735   0.130    0.897
## GenderMale                              -686.043    140.593  -4.880 1.17e-06
## TypePart-Time                          -1034.712    140.656  -7.356 3.01e-13
##                                          
## (Intercept)                              
## ClusterFamily & Community Services    ***
## ClusterFinance, Services & Innovation ***
## ClusterHealth                         ***
## ClusterIndustry                       ***
## ClusterJustice                        ***
## ClusterPlanning & Environment         ***
## ClusterPremier & Cabinet              ***
## ClusterTransport                      ***
## ClusterTreasury                       ***
## Year                                     
## GenderMale                            ***
## TypePart-Time                         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2824 on 1601 degrees of freedom
## Multiple R-squared:  0.2197, Adjusted R-squared:  0.2139 
## F-statistic: 37.57 on 12 and 1601 DF,  p-value: < 2.2e-16
predict<- predict(model, train)


df_mod$Pred.Headcount <- predict(model,data.frame(Year=2025,Type=df_mod$Type, Gender=df_mod$Gender, Cluster=df_mod$Cluster))

Different insights of 2025

df_mod %>% 
  filter(Type=="Full-Time") %>% 
  ggplot()+
  geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
  coord_flip()+
  labs(title = "Number of headcount in 2025 of Full-time workers ",
       y="Headcount")

df_mod %>% 
  filter(Type=="Part-Time") %>% 
  ggplot()+
  geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
  coord_flip()+
  labs(title = "Number of headcount in 2025 of Part-Time workers ",
       y="Headcount")

#after neglecting negetive headcount
df_mod %>% 
    mutate(Pred.Headcount= if_else(Pred.Headcount<0, 0, Pred.Headcount)) %>% 
    filter(Type=="Part-Time") %>% 
  ggplot()+
  geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
  coord_flip()+
  labs(title = "Number of headcount in 2025 of Part-Time workers ",
       y="Headcount")